/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    Canopy.java
 *    Copyright (C) 2014 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.clusterers;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.List;
import java.util.Random;
import java.util.Vector;

import weka.classifiers.rules.DecisionTableHashKey;
import weka.core.AttributeStats;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.DenseInstance;
import weka.core.EuclideanDistance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.NormalizableDistance;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.SparseInstance;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.ReplaceMissingValues;

/**
 * <!-- globalinfo-start --> Cluster data using the capopy clustering algorithm,
 * which requires just one pass over the data. Can run in eitherbatch or
 * incremental mode. Results are generally not as good when running
 * incrementally as the min/max for each numeric attribute is not known in
 * advance. Has a heuristic (based on attribute std. deviations), that can be
 * used in batch mode, for setting the T2 distance. The T2 distance determines
 * how many canopies (clusters) are formed. When the user specifies a specific
 * number (N) of clusters to generate, the algorithm will return the top N
 * canopies (as determined by T2 density) when N &lt; number of canopies (this
 * applies to both batch and incremental learning); when N &gt; number of
 * canopies, the difference is made up by selecting training instances randomly
 * (this can only be done when batch training). For more information see:<br/>
 * <br/>
 * A. McCallum, K. Nigam, L.H. Ungar: Efficient Clustering of High Dimensional
 * Data Sets with Application to Reference Matching. In: Proceedings of the
 * sixth ACM SIGKDD internation conference on knowledge discovery and data
 * mining ACM-SIAM symposium on Discrete algorithms, 169-178, 2000.
 * <p/>
 * <!-- globalinfo-end -->
 * 
 * <!-- technical-bibtex-start --> BibTeX:
 * 
 * <pre>
 * &#64;inproceedings{McCallum2000,
 *    author = {A. McCallum and K. Nigam and L.H. Ungar},
 *    booktitle = {Proceedings of the sixth ACM SIGKDD internation conference on knowledge discovery and data mining ACM-SIAM symposium on Discrete algorithms},
 *    pages = {169-178},
 *    title = {Efficient Clustering of High Dimensional Data Sets with Application to Reference Matching},
 *    year = {2000}
 * }
 * </pre>
 * <p/>
 * <!-- technical-bibtex-end -->
 * 
 * <!-- options-start --> Valid options are:
 * <p/>
 * 
 * <pre>
 *  -N &lt;num&gt;
 *  Number of clusters.
 *  (default 2).
 * </pre>
 * 
 * <pre>
 *  -max-candidates &lt;num&gt;
 *  Maximum number of candidate canopies to retain in memory
 *  at any one time. T2 distance plus, data characteristics,
 *  will determine how many candidate canopies are formed before
 *  periodic and final pruning are performed, which might result
 *  in exceess memory consumption. This setting avoids large numbers
 *  of candidate canopies consuming memory. (default = 100)
 * </pre>
 * 
 * <pre>
 *  -periodic-pruning &lt;num&gt;
 *  How often to prune low density canopies. 
 *  (default = every 10,000 training instances)
 * </pre>
 * 
 * <pre>
 *  -min-density
 *  Minimum canopy density, below which a canopy will be pruned
 *  during periodic pruning. (default = 2 instances)
 * </pre>
 * 
 * <pre>
 *  -t2
 *  The T2 distance to use. Values &lt; 0 indicate that
 *  a heuristic based on attribute std. deviation should be used to set this.
 *  Note that this heuristic can only be used when batch training
 *  (default = -1.0)
 * </pre>
 * 
 * <pre>
 *  -t1
 *  The T1 distance to use. A value &lt; 0 is taken as a
 *  positive multiplier for T2. (default = -1.5)
 * </pre>
 * 
 * <pre>
 *  -M
 *  Don't replace missing values with mean/mode when running in batch mode.
 * </pre>
 * 
 * <pre>
 *  -S &lt;num&gt;
 *  Random number seed.
 *  (default 1)
 * </pre>
 * 
 * <pre>
 *  -output-debug-info
 *  If set, clusterer is run in debug mode and
 *  may output additional info to the console
 * </pre>
 * 
 * <pre>
 *  -do-not-check-capabilities
 *  If set, clusterer capabilities are not checked before clusterer is built
 *  (use with caution).
 * </pre>
 * 
 * <!-- options-end -->
 * 
 * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
 * @version $Revision$
 */
public class Canopy extends RandomizableClusterer implements UpdateableClusterer, NumberOfClustersRequestable, OptionHandler, TechnicalInformationHandler {

    /** For serialization */
    private static final long serialVersionUID = 2067574593448223334L;

    /** The canopy centers */
    protected Instances m_canopies;

    /** The T2 density of each canopy */
    protected List<double[]> m_canopyT2Density;
    protected List<double[][]> m_canopyCenters;
    protected List<double[]> m_canopyNumMissingForNumerics;

    /**
     * The list of canopies that each canopy is a member of (according to the T1
     * radius, which can overlap). Each bit position in the long values corresponds
     * to one canopy. Outer list order corresponds to the order of the instances
     * that store the actual canopy centers
     */
    protected List<long[]> m_clusterCanopies;

    public static final double DEFAULT_T2 = -1.0;
    public static final double DEFAULT_T1 = -1.25;

    /** < 0 means use the heuristic based on std. dev. to set the t2 radius */
    protected double m_userT2 = DEFAULT_T2;

    /**
     * < 0 indicates the multiplier to use for T2 when setting T1, otherwise the
     * value is take as is
     */
    protected double m_userT1 = DEFAULT_T1;

    /** Outer radius */
    protected double m_t1 = m_userT1;

    /** Inner radius */
    protected double m_t2 = m_userT2;

    /**
     * Prune low-density candidate canopies after every x instances have been seen
     */
    protected int m_periodicPruningRate = 10000;

    /**
     * The minimum cluster density (according to T2 distance) allowed. Used when
     * periodically pruning candidate canopies
     */
    protected double m_minClusterDensity = 2;

    /**
     * The maximum number of candidate canopies to hold in memory at any one time
     */
    protected int m_maxCanopyCandidates = 100;

    /**
     * True if the pruning operation did remove at least one low density canopy the
     * last time it was invoked
     */
    protected boolean m_didPruneLastTime = true;

    /** Number of training instances seen so far */
    protected int m_instanceCount;

    /**
     * Default is to let the t2 radius determine how many canopies/clusters are
     * formed
     */
    protected int m_numClustersRequested = -1;

    /**
     * If not null, then this is expected to be a filter that can replace missing
     * values immediately (at training and testing time)
     */
    protected Filter m_missingValuesReplacer;

    /**
     * Replace missing values globally when running in batch mode?
     */
    protected boolean m_dontReplaceMissing = false;

    /** The distance function to use */
    protected NormalizableDistance m_distanceFunction = new EuclideanDistance();

    /**
     * Used to pad out number of cluster centers if fewer canopies are generated
     * than the number of requested clusters and we are running in batch mode.
     */
    protected Instances m_trainingData;

    /**
     * Returns a string describing this clusterer.
     * 
     * @return a description of the evaluator suitable for displaying in the
     *         explorer/experimenter gui
     */
    public String globalInfo() {
        return "Cluster data using the capopy clustering algorithm, which requires just " + "one pass over the data. Can run in either" + "batch or incremental mode. Results are generally not as good when " + "running incrementally as the min/max for each numeric attribute is not " + "known in advance. Has a heuristic (based on attribute std. deviations), " + "that can be used in batch mode, for setting the T2 distance. The T2 distance " + "determines how many canopies (clusters) are formed. When the user specifies " + "a specific number (N) of clusters to generate, the algorithm will return the " + "top N canopies (as determined by T2 density) when N < number of canopies " + "(this applies to both batch and incremental learning); " + "when N > number of canopies, the difference is made up by selecting training " + "instances randomly (this can only be done when batch training). For more " + "information see:\n\n" + getTechnicalInformation().toString();

    }

    @Override
    public TechnicalInformation getTechnicalInformation() {
        TechnicalInformation result;

        result = new TechnicalInformation(Type.INPROCEEDINGS);
        result.setValue(Field.AUTHOR, "A. McCallum and K. Nigam and L.H. Ungar");
        result.setValue(Field.TITLE, "Efficient Clustering of High Dimensional Data Sets with Application to Reference Matching");
        result.setValue(Field.BOOKTITLE, "Proceedings of the sixth ACM SIGKDD internation conference on " + "knowledge discovery and data mining " + "ACM-SIAM symposium on Discrete algorithms");
        result.setValue(Field.YEAR, "2000");
        result.setValue(Field.PAGES, "169-178");

        return result;
    }

    /**
     * Returns default capabilities of the clusterer.
     * 
     * @return the capabilities of this clusterer
     */
    @Override
    public Capabilities getCapabilities() {
        Capabilities result = super.getCapabilities();
        result.disableAll();
        result.enable(Capability.NO_CLASS);

        // attributes
        result.enable(Capability.NOMINAL_ATTRIBUTES);
        result.enable(Capability.NUMERIC_ATTRIBUTES);
        result.enable(Capability.MISSING_VALUES);

        return result;
    }

    /**
     * Returns an enumeration describing the available options.
     * 
     * @return an enumeration of all the available options.
     */
    @Override
    public Enumeration<Option> listOptions() {
        Vector<Option> result = new Vector<Option>();

        result.addElement(new Option("\tNumber of clusters.\n" + "\t(default 2).", "N", 1, "-N <num>"));

        result.addElement(new Option("\tMaximum number of candidate canopies to retain in memory\n\t" + "at any one time. T2 distance plus, data characteristics,\n\t" + "will determine how many candidate canopies are formed before\n\t" + "periodic and final pruning are performed, which might result\n\t" + "in exceess memory consumption. This setting avoids large numbers\n\t" + "of candidate canopies consuming memory. (default = 100)", "-max-candidates", 1, "-max-candidates <num>"));

        result.addElement(new Option("\tHow often to prune low density canopies. \n\t" + "(default = every 10,000 training instances)", "periodic-pruning", 1, "-periodic-pruning <num>"));

        result.addElement(new Option("\tMinimum canopy density, below which a canopy will be pruned\n\t" + "during periodic pruning. (default = 2 instances)", "min-density", 1, "-min-density"));

        result.addElement(new Option("\tThe T2 distance to use. Values < 0 indicate that\n\t" + "a heuristic based on attribute std. deviation should be used to set this.\n\t" + "Note that this heuristic can only be used when batch training\n\t" + "(default = -1.0)", "t2", 1, "-t2"));

        result.addElement(new Option("\tThe T1 distance to use. A value < 0 is taken as a\n\t" + "positive multiplier for T2. (default = -1.5)", "t1", 1, "-t1"));

        result.addElement(new Option("\tDon't replace missing values with mean/mode when " + "running in batch mode.\n", "M", 0, "-M"));

        result.addAll(Collections.list(super.listOptions()));

        return result.elements();
    }

    /**
     * Parses a given list of options.
     * <p/>
     * 
     * <!-- options-start --> Valid options are:
     * <p/>
     * 
     * <pre>
     *  -N &lt;num&gt;
     *  Number of clusters.
     *  (default 2).
     * </pre>
     * 
     * <pre>
     *  -max-candidates &lt;num&gt;
     *  Maximum number of candidate canopies to retain in memory
     *  at any one time. T2 distance plus, data characteristics,
     *  will determine how many candidate canopies are formed before
     *  periodic and final pruning are performed, which might result
     *  in exceess memory consumption. This setting avoids large numbers
     *  of candidate canopies consuming memory. (default = 100)
     * </pre>
     * 
     * <pre>
     *  -periodic-pruning &lt;num&gt;
     *  How often to prune low density canopies. 
     *  (default = every 10,000 training instances)
     * </pre>
     * 
     * <pre>
     *  -min-density
     *  Minimum canopy density, below which a canopy will be pruned
     *  during periodic pruning. (default = 2 instances)
     * </pre>
     * 
     * <pre>
     *  -t2
     *  The T2 distance to use. Values &lt; 0 indicate that
     *  a heuristic based on attribute std. deviation should be used to set this.
     *  Note that this heuristic can only be used when batch training
     *  (default = -1.0)
     * </pre>
     * 
     * <pre>
     *  -t1
     *  The T1 distance to use. A value &lt; 0 is taken as a
     *  positive multiplier for T2. (default = -1.5)
     * </pre>
     * 
     * <pre>
     *  -M
     *  Don't replace missing values with mean/mode when running in batch mode.
     * </pre>
     * 
     * <pre>
     *  -S &lt;num&gt;
     *  Random number seed.
     *  (default 1)
     * </pre>
     * 
     * <pre>
     *  -output-debug-info
     *  If set, clusterer is run in debug mode and
     *  may output additional info to the console
     * </pre>
     * 
     * <pre>
     *  -do-not-check-capabilities
     *  If set, clusterer capabilities are not checked before clusterer is built
     *  (use with caution).
     * </pre>
     * 
     * <!-- options-end -->
     * 
     * @param options the list of options as an array of strings throws Exception if
     *                an option is not supported
     */
    @Override
    public void setOptions(String[] options) throws Exception {

        String temp = Utils.getOption('N', options);
        if (temp.length() > 0) {
            setNumClusters(Integer.parseInt(temp));
        }

        temp = Utils.getOption("max-candidates", options);
        if (temp.length() > 0) {
            setMaxNumCandidateCanopiesToHoldInMemory(Integer.parseInt(temp));
        }

        temp = Utils.getOption("periodic-pruning", options);
        if (temp.length() > 0) {
            setPeriodicPruningRate(Integer.parseInt(temp));
        }

        temp = Utils.getOption("min-density", options);
        if (temp.length() > 0) {
            setMinimumCanopyDensity(Double.parseDouble(temp));
        }

        temp = Utils.getOption("t2", options);
        if (temp.length() > 0) {
            setT2(Double.parseDouble(temp));
        }

        temp = Utils.getOption("t1", options);
        if (temp.length() > 0) {
            setT1(Double.parseDouble(temp));
        }

        setDontReplaceMissingValues(Utils.getFlag('M', options));

        super.setOptions(options);
    }

    /**
     * Gets the current settings of Canopy.
     * 
     * @return an array of strings suitable for passing to setOptions()
     */
    @Override
    public String[] getOptions() {

        Vector<String> result = new Vector<String>();

        result.add("-N");
        result.add("" + getNumClusters());
        result.add("-max-candidates");
        result.add("" + getMaxNumCandidateCanopiesToHoldInMemory());
        result.add("-periodic-pruning");
        result.add("" + getPeriodicPruningRate());
        result.add("-min-density");
        result.add("" + getMinimumCanopyDensity());
        result.add("-t2");
        result.add("" + getT2());
        result.add("-t1");
        result.add("" + getT1());

        if (getDontReplaceMissingValues()) {
            result.add("-M");
        }

        Collections.addAll(result, super.getOptions());

        return result.toArray(new String[result.size()]);
    }

    /**
     * Tests if two sets of canopies have a non-empty intersection
     * 
     * @param first  the first canopy set
     * @param second the second canopy set
     * @return true if the intersection is non-empty
     * @throws Exception if a problem occurs
     */
    public static boolean nonEmptyCanopySetIntersection(long[] first, long[] second) throws Exception {
        if (first.length != second.length) {
            throw new Exception("Canopy lists need to be the same length");
        }

        if (first.length == 0 || second.length == 0) {
            return false;
        }

        for (int i = 0; i < first.length; i++) {
            long firstBlock = first[i];
            long secondBlock = second[i];

            if ((firstBlock & secondBlock) != 0L) {
                return true;
            }
        }

        return false;
    }

    private static void updateCanopyAssignment(long[] assigned, int toAssign) {
        int whichLong = toAssign / 64;
        int whichBitPosition = toAssign % 64;
        long mask = 1L << whichBitPosition;

        assigned[whichLong] |= mask;
    }

    /**
     * Uses T1 distance to assign canopies to the supplied instance. If the instance
     * does not fall within T1 distance of any canopies then the instance has the
     * closest canopy assigned to it.
     * 
     * @param inst the instance to find covering canopies for
     * @return a set of canopies that contain this instance according to T1 distance
     * @throws Exception if a problem occurs
     */
    public long[] assignCanopies(Instance inst) throws Exception {
        if (m_missingValuesReplacer != null) {
            m_missingValuesReplacer.input(inst);
            inst = m_missingValuesReplacer.output();
        }

        int numLongs = m_canopies.size() / 64 + 1;
        long[] assigned = new long[numLongs];

        double minDist = Double.MAX_VALUE;
        double bitsSet = 0;
        int index = -1;
        for (int i = 0; i < m_canopies.numInstances(); i++) {
            double dist = m_distanceFunction.distance(inst, m_canopies.instance(i));
            if (dist < minDist) {
                minDist = dist;
                index = i;
            }

            if (dist < m_t1) {
                updateCanopyAssignment(assigned, i);
                bitsSet++;
                // assigned.add(i);
            }
        }

        // this won't be necessary (for the training data) unless canopies have been
        // pruned due to the user requesting fewer canopies than are created
        // naturally according to the t2 radius
        if (bitsSet == 0) {
            // add the closest canopy
            updateCanopyAssignment(assigned, index);
        }

        return assigned;
    }

    protected void updateCanopyCenter(Instance newInstance, double[][] center, double[] numMissingNumerics) {
        for (int i = 0; i < newInstance.numAttributes(); i++) {
            if (newInstance.attribute(i).isNumeric()) {
                if (center[i].length == 0) {
                    center[i] = new double[1];
                }

                if (!newInstance.isMissing(i)) {
                    center[i][0] += newInstance.value(i);
                } else {
                    numMissingNumerics[i]++;
                }
            } else if (newInstance.attribute(i).isNominal()) {
                if (center[i].length == 0) {
                    // +1 for missing
                    center[i] = new double[newInstance.attribute(i).numValues() + 1];
                }
                if (newInstance.isMissing(i)) {
                    center[i][center[i].length - 1]++;
                } else {
                    center[i][(int) newInstance.value(i)]++;
                }
            }
        }
    }

    @Override
    public void updateClusterer(Instance newInstance) throws Exception {

        if (m_instanceCount > 0) {
            if (m_instanceCount % m_periodicPruningRate == 0) {
                pruneCandidateCanopies();
            }
        }

        m_instanceCount++;
        if (m_missingValuesReplacer != null) {
            m_missingValuesReplacer.input(newInstance);
            newInstance = m_missingValuesReplacer.output();
        }

        m_distanceFunction.update(newInstance);
        boolean addPoint = true;

        for (int i = 0; i < m_canopies.numInstances(); i++) {
            if (m_distanceFunction.distance(newInstance, m_canopies.instance(i)) < m_t2) {
                double[] density = m_canopyT2Density.get(i);
                density[0]++;
                addPoint = false;

                double[][] center = m_canopyCenters.get(i);
                double[] numMissingNumerics = m_canopyNumMissingForNumerics.get(i);
                updateCanopyCenter(newInstance, center, numMissingNumerics);

                break;
            }
        }

        if (addPoint && m_canopies.numInstances() < m_maxCanopyCandidates) {
            m_canopies.add(newInstance);
            double[] density = new double[1];
            density[0] = 1.0;
            m_canopyT2Density.add(density);

            double[][] center = new double[newInstance.numAttributes()][0];
            double[] numMissingNumerics = new double[newInstance.numAttributes()];
            updateCanopyCenter(newInstance, center, numMissingNumerics);
            m_canopyCenters.add(center);
            m_canopyNumMissingForNumerics.add(numMissingNumerics);
        }
    }

    /**
     * Prune low density candidate canopies
     */
    protected void pruneCandidateCanopies() {
        if (m_didPruneLastTime == false && m_canopies.size() == m_maxCanopyCandidates) {
            return;
        }

        m_didPruneLastTime = false;
        for (int i = m_canopies.numInstances() - 1; i >= 0; i--) {
            double dens = m_canopyT2Density.get(i)[0];
            if (dens < m_minClusterDensity) {
                double[] tempDens = m_canopyT2Density.remove(m_canopyT2Density.size() - 1);
                if (i < m_canopyT2Density.size()) {
                    m_canopyT2Density.set(i, tempDens);
                }
                if (getDebug()) {
                    System.err.println("Pruning a candidate canopy with density: " + dens);
                }
                m_didPruneLastTime = true;

                double[][] tempCenter = m_canopyCenters.remove(m_canopyCenters.size() - 1);
                if (i < m_canopyCenters.size()) {
                    m_canopyCenters.set(i, tempCenter);
                }

                double[] tempNumMissingNumerics = m_canopyNumMissingForNumerics.remove(m_canopyNumMissingForNumerics.size() - 1);

                if (i < m_canopyNumMissingForNumerics.size()) {
                    m_canopyNumMissingForNumerics.set(i, tempNumMissingNumerics);
                }

                if (i != m_canopies.numInstances() - 1) {
                    m_canopies.swap(i, m_canopies.numInstances() - 1);
                }
                m_canopies.delete(m_canopies.numInstances() - 1);
            }
        }
    }

    @Override
    public double[] distributionForInstance(Instance instance) throws Exception {
        if (m_canopies == null || m_canopies.size() == 0) {
            throw new Exception("No canopies available to cluster with!");
        }

        double[] d = new double[numberOfClusters()];

        if (m_missingValuesReplacer != null) {
            m_missingValuesReplacer.input(instance);
            instance = m_missingValuesReplacer.output();
        }

        for (int i = 0; i < m_canopies.numInstances(); i++) {
            double distance = m_distanceFunction.distance(instance, m_canopies.instance(i));

            d[i] = 1.0 / (1.0 + distance);
        }

        Utils.normalize(d);
        return d;
    }

    private void assignCanopiesToCanopyCenters() {
        // assign canopies to each canopy center
        m_clusterCanopies = new ArrayList<long[]>();
        for (int i = 0; i < m_canopies.size(); i++) {
            Instance inst = m_canopies.instance(i);
            try {
                long[] assignments = assignCanopies(inst);
                m_clusterCanopies.add(assignments);
            } catch (Exception ex) {
                ex.printStackTrace();
            }
        }
    }

    /**
     * Adjust the final number of canopies to match the user-requested number (if
     * possible)
     * 
     * @param densities the density of each of the canopies
     */
    protected void adjustCanopies(double[] densities) {
        if (m_numClustersRequested < 0) {
            assignCanopiesToCanopyCenters();

            m_trainingData = new Instances(m_canopies, 0);
            return;
        }

        // more canopies than requested?
        if (m_canopies.numInstances() > m_numClustersRequested) {
            int[] sortedIndexes = Utils.stableSort(densities);

            Instances finalCanopies = new Instances(m_canopies, 0);
            int count = 0;
            for (int i = sortedIndexes.length - 1; count < m_numClustersRequested; i--) {
                finalCanopies.add(m_canopies.instance(sortedIndexes[i]));
                count++;
            }

            m_canopies = finalCanopies;
            List<double[][]> tempCanopyCenters = new ArrayList<double[][]>();
            List<double[]> tempT2Dists = new ArrayList<double[]>();
            List<double[]> tempMissings = new ArrayList<double[]>();

            // make sure that the center sums, densities and missing counts are
            // aligned with the new canopy list
            count = 0;
            for (int i = sortedIndexes.length - 1; count < finalCanopies.numInstances(); i--) {
                tempCanopyCenters.add(m_canopyCenters.get(sortedIndexes[i]));
                tempT2Dists.add(m_canopyT2Density.get(sortedIndexes[i]));
                tempMissings.add(m_canopyNumMissingForNumerics.get(sortedIndexes[i]));
                count++;
            }
            m_canopyCenters = tempCanopyCenters;
            m_canopyT2Density = tempT2Dists;
            m_canopyNumMissingForNumerics = tempMissings;

        } else if (m_canopies.numInstances() < m_numClustersRequested && m_trainingData != null && m_trainingData.numInstances() > 0) {

            // make up the difference with randomly selected instances (if possible)
            Random r = new Random(getSeed());
            for (int i = 0; i < 10; i++) {
                r.nextInt();
            }
            HashMap<DecisionTableHashKey, Integer> initC = new HashMap<DecisionTableHashKey, Integer>();
            DecisionTableHashKey hk = null;

            // put the existing canopies in the lookup
            for (int i = 0; i < m_canopies.numInstances(); i++) {
                try {
                    hk = new DecisionTableHashKey(m_canopies.instance(i), m_canopies.numAttributes(), true);

                    initC.put(hk, null);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }

            for (int j = m_trainingData.numInstances() - 1; j >= 0; j--) {
                int instIndex = r.nextInt(j + 1);
                try {
                    hk = new DecisionTableHashKey(m_trainingData.instance(instIndex), m_trainingData.numAttributes(), true);
                } catch (Exception e) {
                    e.printStackTrace();
                }
                if (!initC.containsKey(hk)) {
                    Instance newInstance = m_trainingData.instance(instIndex);
                    m_canopies.add(newInstance);

                    double[] density = new double[1];
                    density[0] = 1.0;
                    m_canopyT2Density.add(density);

                    double[][] center = new double[newInstance.numAttributes()][0];
                    double[] numMissingNumerics = new double[newInstance.numAttributes()];
                    updateCanopyCenter(newInstance, center, numMissingNumerics);
                    m_canopyCenters.add(center);
                    m_canopyNumMissingForNumerics.add(numMissingNumerics);

                    initC.put(hk, null);
                }
                m_trainingData.swap(j, instIndex);

                if (m_canopies.numInstances() == m_numClustersRequested) {
                    break;
                }
            }
        }

        assignCanopiesToCanopyCenters();

        // save memory
        m_trainingData = new Instances(m_canopies, 0);
    }

    @Override
    public void updateFinished() {
        if (m_canopies == null || m_canopies.numInstances() == 0) {
            return;
        }

        pruneCandidateCanopies();

        // set the final canopy centers and weights
        double[] densities = new double[m_canopies.size()];
        for (int i = 0; i < m_canopies.numInstances(); i++) {
            double[] density = m_canopyT2Density.get(i);
            double[][] centerSums = m_canopyCenters.get(i);
            double[] numMissingForNumerics = m_canopyNumMissingForNumerics.get(i);
            double[] finalCenter = new double[m_canopies.numAttributes()];
            for (int j = 0; j < m_canopies.numAttributes(); j++) {
                if (m_canopies.attribute(j).isNumeric()) {
                    if (numMissingForNumerics[j] == density[0]) {
                        finalCenter[j] = Utils.missingValue();
                    } else {
                        finalCenter[j] = centerSums[j][0] / (density[0] - numMissingForNumerics[j]);
                    }
                } else if (m_canopies.attribute(j).isNominal()) {
                    int mode = Utils.maxIndex(centerSums[j]);
                    if (mode == centerSums[j].length - 1) {
                        finalCenter[j] = Utils.missingValue();
                    } else {
                        finalCenter[j] = mode;
                    }
                }
            }

            Instance finalCenterInst = m_canopies.instance(i) instanceof SparseInstance ? new SparseInstance(1.0, finalCenter) : new DenseInstance(1.0, finalCenter);
            m_canopies.set(i, finalCenterInst);

            m_canopies.instance(i).setWeight(density[0]);
            densities[i] = density[0];
        }

        adjustCanopies(densities);
    }

    /**
     * Initialize the distance function (i.e set min/max values for numeric
     * attributes) with the supplied instances.
     * 
     * @param init the instances to initialize with
     * @throws Exception if a problem occurs
     */
    public void initializeDistanceFunction(Instances init) throws Exception {
        if (m_missingValuesReplacer != null) {
            init = Filter.useFilter(init, m_missingValuesReplacer);
        }

        m_distanceFunction.setInstances(init);
    }

    /**
     * Pretty hokey heuristic to try and set t2 distance automatically based on
     * standard deviation
     * 
     * @param trainingBatch the training instances
     * @throws Exception if a problem occurs
     */
    protected void setT2T1BasedOnStdDev(Instances trainingBatch) throws Exception {
        double normalizedStdDevSum = 0;

        for (int i = 0; i < trainingBatch.numAttributes(); i++) {
            if (trainingBatch.attribute(i).isNominal()) {
                normalizedStdDevSum += 0.25;
            } else if (trainingBatch.attribute(i).isNumeric()) {
                AttributeStats stats = trainingBatch.attributeStats(i);
                if (trainingBatch.numInstances() - stats.missingCount > 2) {
                    double stdDev = stats.numericStats.stdDev;
                    double min = stats.numericStats.min;
                    double max = stats.numericStats.max;
                    if (!Utils.isMissingValue(stdDev) && max - min > 0) {
                        stdDev = 0.5 * stdDev / (max - min);
                        normalizedStdDevSum += stdDev;
                    }
                }
            }
        }

        normalizedStdDevSum = Math.sqrt(normalizedStdDevSum);
        if (normalizedStdDevSum > 0) {
            m_t2 = normalizedStdDevSum;
        }
    }

    @Override
    public void buildClusterer(Instances data) throws Exception {
        m_t1 = m_userT1;
        m_t2 = m_userT2;

        if (data.numInstances() == 0 && m_userT2 < 0) {
            System.err.println("The heuristic for setting T2 based on std. dev. can't be used when " + "running in incremental mode. Using default of 1.0.");
            m_t2 = 1.0;
        }

        m_canopyT2Density = new ArrayList<double[]>();
        m_canopyCenters = new ArrayList<double[][]>();
        m_canopyNumMissingForNumerics = new ArrayList<double[]>();

        if (data.numInstances() > 0) {
            if (!m_dontReplaceMissing) {
                m_missingValuesReplacer = new ReplaceMissingValues();
                m_missingValuesReplacer.setInputFormat(data);
                data = Filter.useFilter(data, m_missingValuesReplacer);
            }
            Random r = new Random(getSeed());
            for (int i = 0; i < 10; i++) {
                r.nextInt();
            }
            data.randomize(r);

            if (m_userT2 < 0) {
                setT2T1BasedOnStdDev(data);
            }
        }
        m_t1 = m_userT1 > 0 ? m_userT1 : -m_userT1 * m_t2;
        // if (m_t1 < m_t2) {
        // throw new Exception("T1 can't be less than T2. Computed T2 as " + m_t2
        // + " T1 is requested to be " + m_t1);
        // }

        m_distanceFunction.setInstances(data);

        m_canopies = new Instances(data, 0);
        if (data.numInstances() > 0) {
            m_trainingData = new Instances(data);
        }

        for (int i = 0; i < data.numInstances(); i++) {
            if (getDebug() && i % m_periodicPruningRate == 0) {
                System.err.println("Processed: " + i);
            }
            updateClusterer(data.instance(i));
        }

        updateFinished();
    }

    @Override
    public int numberOfClusters() throws Exception {
        return m_canopies.numInstances();
    }

    /**
     * Set a ready-to-use missing values replacement filter
     * 
     * @param missingReplacer the missing values replacement filter to use
     */
    public void setMissingValuesReplacer(Filter missingReplacer) {
        m_missingValuesReplacer = missingReplacer;
    }

    /**
     * Get the canopies (cluster centers).
     * 
     * @return the canopies
     */
    public Instances getCanopies() {
        return m_canopies;
    }

    /**
     * Set the canopies to use (replaces any learned by this clusterer already)
     * 
     * @param canopies the canopies to use
     */
    public void setCanopies(Instances canopies) {
        m_canopies = canopies;
    }

    /**
     * Get the canopies that each canopy (cluster center) is within T1 distance of
     * 
     * @return a list of canopies for each cluster center
     */
    public List<long[]> getClusterCanopyAssignments() {
        return m_clusterCanopies;
    }

    /**
     * Set the canopies that each canopy (cluster center) is within T1 distance of
     * 
     * @param clusterCanopies the list canopies for each cluster center
     */
    public void setClusterCanopyAssignments(List<long[]> clusterCanopies) {
        m_clusterCanopies = clusterCanopies;
    }

    /**
     * Get the actual value of T2 (which may be different from the initial value if
     * the heuristic is used)
     * 
     * @return the actual value of T2
     */
    public double getActualT2() {
        return m_t2;
    }

    /**
     * Get the actual value of T1 (which may be different from the initial value if
     * the heuristic is used)
     * 
     * @return the actual value of T1
     */
    public double getActualT1() {
        return m_t1;
    }

    /**
     * Tip text for this property
     * 
     * @return the tip text for this property
     */
    public String t1TipText() {
        return "The T1 distance to use. Values < 0 are taken as a positive " + "multiplier for the T2 distance";
    }

    /**
     * Set the T1 distance. Values < 0 are taken as a positive multiplier for the T2
     * distance - e.g. T1_actual = Math.abs(t1) * t2;
     * 
     * @param t1 the T1 distance to use
     */
    public void setT1(double t1) {
        m_userT1 = t1;
    }

    /**
     * Get the T1 distance. Values < 0 are taken as a positive multiplier for the T2
     * distance - e.g. T1_actual = Math.abs(t1) * t2;
     * 
     * @return the T1 distance to use
     */
    public double getT1() {
        return m_userT1;
    }

    /**
     * Tip text for this property
     * 
     * @return the tip text for this property
     */
    public String t2TipText() {
        return "The T2 distance to use. Values < 0 indicate that this should be set using " + "a heuristic based on attribute standard deviation (note that this only" + "works when batch training)";
    }

    /**
     * Set the T2 distance to use. Values < 0 indicate that a heuristic based on
     * attribute standard deviation should be used to set this (note that the
     * heuristic is only applicable when batch training).
     * 
     * @param t2 the T2 distance to use
     */
    public void setT2(double t2) {
        m_userT2 = t2;
    }

    /**
     * Get the T2 distance to use. Values < 0 indicate that a heuristic based on
     * attribute standard deviation should be used to set this (note that the
     * heuristic is only applicable when batch training).
     * 
     * @return the T2 distance to use
     */
    public double getT2() {
        return m_userT2;
    }

    /**
     * Returns the tip text for this property.
     * 
     * @return tip text for this property suitable for displaying in the
     *         explorer/experimenter gui
     */
    public String numClustersTipText() {
        return "Set number of clusters. -1 means number of clusters is determined by " + "T2 distance";
    }

    @Override
    public void setNumClusters(int numClusters) throws Exception {
        m_numClustersRequested = numClusters;
    }

    /**
     * Get the number of clusters to generate
     * 
     * @return the number of clusters to generate
     */
    public int getNumClusters() {
        return m_numClustersRequested;
    }

    /**
     * Returns the tip text for this property.
     * 
     * @return tip text for this property suitable for displaying in the
     *         explorer/experimenter gui
     */
    public String periodicPruningRateTipText() {
        return "How often to prune low density canopies during training";
    }

    /**
     * Set the how often to prune low density canopies during training
     * 
     * @param p how often (every p instances) to prune low density canopies
     */
    public void setPeriodicPruningRate(int p) {
        m_periodicPruningRate = p;
    }

    /**
     * Get the how often to prune low density canopies during training
     * 
     * @return how often (every p instances) to prune low density canopies
     */
    public int getPeriodicPruningRate() {
        return m_periodicPruningRate;
    }

    /**
     * Returns the tip text for this property.
     * 
     * @return tip text for this property suitable for displaying in the
     *         explorer/experimenter gui
     */
    public String minimumCanopyDensityTipText() {
        return "The minimum T2-based density below which a canopy will be pruned during periodic pruning";
    }

    /**
     * Set the minimum T2-based density below which a canopy will be pruned during
     * periodic pruning.
     * 
     * @param dens the minimum canopy density
     */
    public void setMinimumCanopyDensity(double dens) {
        m_minClusterDensity = dens;
    }

    /**
     * Get the minimum T2-based density below which a canopy will be pruned during
     * periodic pruning.
     * 
     * @return the minimum canopy density
     */
    public double getMinimumCanopyDensity() {
        return m_minClusterDensity;
    }

    /**
     * Returns the tip text for this property.
     * 
     * @return tip text for this property suitable for displaying in the
     *         explorer/experimenter gui
     */
    public String maxNumCandidateCanopiesToHoldInMemory() {
        return "The maximum number of candidate canopies to retain in main memory during training. " + "T2 distance and data characteristics determine how many candidate " + "canopies are formed before periodic and final pruning are performed. There " + "may not be enough memory available if T2 is set too low.";
    }

    /**
     * Set the maximum number of candidate canopies to retain in memory during
     * training. T2 distance and data characteristics determine how many candidate
     * canopies are formed before periodic and final pruning are performed. There
     * may not be enough memory available if T2 is set too low.
     * 
     * @param max the maximum number of candidate canopies to retain in memory
     *            during training
     */
    public void setMaxNumCandidateCanopiesToHoldInMemory(int max) {
        m_maxCanopyCandidates = max;
    }

    /**
     * Get the maximum number of candidate canopies to retain in memory during
     * training. T2 distance and data characteristics determine how many candidate
     * canopies are formed before periodic and final pruning are performed. There
     * may not be enough memory available if T2 is set too low.
     * 
     * @return the maximum number of candidate canopies to retain in memory during
     *         training
     */
    public int getMaxNumCandidateCanopiesToHoldInMemory() {
        return m_maxCanopyCandidates;
    }

    /**
     * Returns the tip text for this property.
     * 
     * @return tip text for this property suitable for displaying in the
     *         explorer/experimenter gui
     */
    public String dontReplaceMissingValuesTipText() {
        return "Replace missing values globally with mean/mode.";
    }

    /**
     * Sets whether missing values are to be replaced.
     * 
     * @param r true if missing values are to be replaced
     */
    public void setDontReplaceMissingValues(boolean r) {
        m_dontReplaceMissing = r;
    }

    /**
     * Gets whether missing values are to be replaced.
     * 
     * @return true if missing values are to be replaced
     */
    public boolean getDontReplaceMissingValues() {
        return m_dontReplaceMissing;
    }

    public static String printSingleAssignment(long[] assignments) {
        StringBuilder temp = new StringBuilder();

        boolean first = true;
        temp.append(" <");
        for (int j = 0; j < assignments.length; j++) {
            long block = assignments[j];
            int offset = j * 64;

            for (int k = 0; k < 64; k++) {
                long mask = 1L << k;

                if ((mask & block) != 0L) {
                    temp.append("" + (!first ? "," : "") + (offset + k));
                    if (first) {
                        first = false;
                    }
                }
            }
        }

        temp.append(">");
        return temp.toString();
    }

    /**
     * Print the supplied instances and their canopies
     * 
     * @param dataPoints        the instances to print
     * @param canopyAssignments the canopy assignments, one assignment array for
     *                          each instance
     * @return a string containing the printed assignments
     */
    public static String printCanopyAssignments(Instances dataPoints, List<long[]> canopyAssignments) {
        StringBuilder temp = new StringBuilder();

        for (int i = 0; i < dataPoints.size(); i++) {
            temp.append("Cluster " + i + ": ");
            temp.append(dataPoints.instance(i));
            if (canopyAssignments != null && canopyAssignments.size() == dataPoints.size()) {
                long[] assignments = canopyAssignments.get(i);
                temp.append(printSingleAssignment(assignments));
            }
            temp.append("\n");
        }

        return temp.toString();
    }

    /**
     * Return a textual description of this clusterer
     * 
     * @param header true if the header should be printed
     * @return a string describing the result of the clustering
     */
    public String toString(boolean header) {
        StringBuffer temp = new StringBuffer();

        if (m_canopies == null) {
            return "No clusterer built yet";
        }

        if (header) {
            temp.append("\nCanopy clustering\n=================\n");
            temp.append("\nNumber of canopies (cluster centers) found: " + m_canopies.numInstances());
        }

        temp.append("\nT2 radius: " + String.format("%-10.3f", m_t2));
        temp.append("\nT1 radius: " + String.format("%-10.3f", m_t1));
        temp.append("\n\n");

        temp.append(printCanopyAssignments(m_canopies, m_clusterCanopies));

        temp.append("\n");

        return temp.toString();
    }

    @Override
    public String toString() {
        return toString(true);
    }

    /**
     * Save memory
     */
    public void cleanUp() {
        m_canopyNumMissingForNumerics = null;
        m_canopyT2Density = null;
        m_canopyCenters = null;
    }

    /**
     * Aggregate the canopies from a list of Canopy clusterers together into one
     * final model.
     * 
     * @param canopies              the list of Canopy clusterers to aggregate
     * @param aggregationT1         the T1 distance to use for the aggregated
     *                              classifier
     * @param aggregationT2         the T2 distance to use when aggregating canopies
     * @param finalDistanceFunction the distance function to use with the final
     *                              Canopy clusterer
     * @param missingValuesReplacer the missing value replacement filter to use with
     *                              the final clusterer (can be null for no missing
     *                              value replacement)
     * @param finalNumCanopies      the final number of canopies
     * @return a Canopy clusterer that aggregates all the canopies
     */
    public static Canopy aggregateCanopies(List<Canopy> canopies, double aggregationT1, double aggregationT2, NormalizableDistance finalDistanceFunction, Filter missingValuesReplacer, int finalNumCanopies) {

        Instances collectedCanopies = new Instances(canopies.get(0).getCanopies(), 0);
        Instances finalCanopies = new Instances(collectedCanopies, 0);

        List<double[][]> finalCenters = new ArrayList<double[][]>();
        List<double[]> finalMissingNumerics = new ArrayList<double[]>();
        List<double[]> finalT2Densities = new ArrayList<double[]>();
        List<Instance> finalCanopiesList = new ArrayList<Instance>();
        List<double[][]> centersForEachCanopy = new ArrayList<double[][]>();
        List<double[]> numMissingNumericsForEachCanopy = new ArrayList<double[]>();

        for (Canopy c : canopies) {
            Instances tempC = c.getCanopies();
            // System.err.println("A canopy clusterer:\n " + c.toString());
            for (int i = 0; i < tempC.numInstances(); i++) {
                collectedCanopies.add(tempC.instance(i));
                centersForEachCanopy.add(c.m_canopyCenters.get(i));
                numMissingNumericsForEachCanopy.add(c.m_canopyNumMissingForNumerics.get(i));
            }
        }

        for (int i = 0; i < collectedCanopies.numInstances(); i++) {
            boolean addPoint = true;
            Instance candidate = collectedCanopies.instance(i);
            double[][] candidateCenter = centersForEachCanopy.get(i);
            double[] candidateMissingNumerics = numMissingNumericsForEachCanopy.get(i);

            for (int j = 0; j < finalCanopiesList.size(); j++) {
                Instance fc = finalCanopiesList.get(j);

                if (finalDistanceFunction.distance(candidate, fc) < aggregationT2) {
                    addPoint = false;

                    // now absorb candidate into fc
                    double[][] center = finalCenters.get(j);
                    double[] missingNumerics = finalMissingNumerics.get(j);
                    // double newDensity = fc.weight() + candidate.weight();
                    finalT2Densities.get(j)[0] += candidate.weight();

                    for (int k = 0; k < candidate.numAttributes(); k++) {
                        missingNumerics[k] += candidateMissingNumerics[k];
                        for (int l = 0; l < center[k].length; l++) {
                            center[k][l] += candidateCenter[k][l];
                        }
                    }

                    break;
                }
            }

            if (addPoint) {
                finalCanopiesList.add(candidate);
                finalCanopies.add(candidate);
                finalCenters.add(candidateCenter);
                finalMissingNumerics.add(candidateMissingNumerics);
                double[] dens = new double[1];
                dens[0] = candidate.weight();
                finalT2Densities.add(dens);
            }
        }

        // now construct a new Canopy encapsulating the final set of canopies
        // System.err.println(finalCanopies);
        Canopy finalC = new Canopy();
        finalC.setCanopies(finalCanopies);
        finalC.setMissingValuesReplacer(missingValuesReplacer);
        finalC.m_distanceFunction = finalDistanceFunction;
        finalC.m_canopyCenters = finalCenters;
        finalC.m_canopyNumMissingForNumerics = finalMissingNumerics;
        finalC.m_canopyT2Density = finalT2Densities;
        finalC.m_t2 = aggregationT2;
        finalC.m_t1 = aggregationT1;
        try {
            finalC.setNumClusters(finalNumCanopies);
        } catch (Exception e) {
            // can safely ignore as Canopy does not generate an exception
        }

        finalC.updateFinished();

        return finalC;
    }

    public static void main(String[] args) {
        runClusterer(new Canopy(), args);
    }

}
